Импорт данных и библиотек¶

In [1]:
# !pip install geopandas
# !pip install h3
# !pip install folium
# !pip install osmnx
# !pip install geojson
In [2]:
import geopandas as gpd
import numpy as np
import json
import h3
import folium
import osmnx as ox
from shapely import wkt
from folium.plugins import HeatMap
from shapely.geometry import Polygon
from folium.plugins import MarkerCluster, HeatMap
import pandas as pd
from shapely.geometry import Polygon
from geojson import Feature, Point, FeatureCollection, Polygon
import plotly.express as px
from tqdm import tqdm

tqdm.pandas()

interests_df = pd.read_csv("stupino_interests.csv")
locs_df = pd.read_csv("stupino_locs.csv")

Распределение данных по карте¶

Построение диаграммы, сколько в каждом гексагоне карты отметок пользователя

In [3]:
H3_res = 9  # размер гексагона [1 .. 15] чем больше, тем меньше площадь


def geo_to_h3(row):
    return h3.geo_to_h3(lat=row.lat, lng=row.lon, resolution=H3_res)


locs_df['h3_cell'] = locs_df.progress_apply(geo_to_h3, axis=1)
100%|██████████| 10880142/10880142 [04:25<00:00, 41016.59it/s]
In [4]:
locs_df_g = (locs_df
             .groupby('h3_cell')
             .id
             .agg(list)
             .to_frame("ids")
             .reset_index())
# Let's count each points inside the hexagon
locs_df_g['count'] = (locs_df_g['ids']
                      .progress_apply(lambda ignition_ids: len(ignition_ids)))
100%|██████████| 1573/1573 [00:00<00:00, 136458.67it/s]
In [5]:
from shapely.geometry import Polygon

def add_geometry(row):
    points = h3.h3_to_geo_boundary(row['h3_cell'], True)
    return Polygon(points)


#Apply function into our dataframe
locs_df_g['geometry'] = (locs_df_g
                         .progress_apply(add_geometry, axis=1))
100%|██████████| 1573/1573 [00:00<00:00, 3812.77it/s]
In [6]:
def hexagons_dataframe_to_geojson(df_hex, hex_id_field, geometry_field, value_field, file_output=None):
    list_features = []

    for i, row in df_hex.iterrows():
        feature = Feature(geometry=row[geometry_field],
                          id=row[hex_id_field],
                          properties={"value": row[value_field]})
        list_features.append(feature)

    feat_collection = FeatureCollection(list_features)

    if file_output is not None:
        with open(file_output, "w") as f:
            json.dump(feat_collection, f)

    else:
        return feat_collection

geojson_obj = (hexagons_dataframe_to_geojson
               (locs_df_g,
                hex_id_field='h3_cell',
                value_field='count',
                geometry_field='geometry'))
In [72]:
import plotly.express as px

fig = px.choropleth_mapbox(
    locs_df_g,
    geojson=geojson_obj,
    locations='h3_cell',
    color='count',
    color_continuous_scale="Viridis",
    range_color=(0, locs_df_g['count'].mean()),
    mapbox_style='carto-positron',
    zoom=12,
    center={"lat": locs_df.lat.mean(), "lon": locs_df.lon.mean()},
    opacity=0.1,
    labels={'count': 'count of data'})
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show("notebook")
In [22]:
interests_df
Out[22]:
marital_status_married marital_status_not_married employment_working employment_not_working availability_of_education_has_a_higher_education availability_of_education_no_higher_education interests_b2b_advertising_and_marketing interests_b2b_raw_materials interests_b2b_equipment_machines_energy_supply interests_b2b_office ... interests_parents_of_toddlers interests_parents_of_preschoolers interests_of_parents_of_primary_school_students interests_parents_of_middle_and_high_school_students interests_business_education age_17 age_55 gender_female gender_male id
0 0 1 1 1 1 1 0 0 0 0 ... 1 0 0 0 0 1 0 1 0 146343
1 0 1 1 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 149957
2 0 0 0 0 1 1 0 0 0 0 ... 0 0 0 0 0 0 0 1 1 78692
3 0 1 1 0 1 1 0 1 0 0 ... 1 0 0 0 1 0 1 1 1 99331
4 0 1 1 0 1 1 0 0 0 0 ... 1 0 1 0 1 0 0 1 1 129854
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
108053 1 0 0 0 1 1 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 90108
108054 0 0 1 0 1 1 0 1 0 0 ... 0 0 0 0 0 0 1 1 0 124168
108055 0 1 0 1 1 1 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 117301
108056 0 1 1 0 1 0 0 1 1 0 ... 0 0 0 0 1 0 0 1 1 104867
108057 0 0 1 0 0 0 0 1 1 0 ... 0 0 0 0 0 1 0 1 1 130525

108058 rows × 106 columns

Возраст¶

In [8]:
ages_df = interests_df[[age for age in interests_df.columns if age.startswith("age_")]]
ages_df.apply(sum).to_frame().style.bar()
Out[8]:
  0
age_18_24 50297
age_25_34 53941
age_35_44 41632
age_45_54 22903
age_17 15314
age_55 27033

Пол¶

In [9]:
gender_df = interests_df[[gender for gender in interests_df.columns if gender.startswith("gender_")]]
gender_df.apply(sum).to_frame().style.bar()
Out[9]:
  0
gender_female 73533
gender_male 73437

Трудоустройство¶

In [10]:
employment_df = interests_df[[user for user in interests_df.columns if user.startswith("employment_")]]
employment_df.apply(sum).to_frame().style.bar()
Out[10]:
  0
employment_working 78881
employment_not_working 26591

Образование¶

In [11]:
interests_df[[user for user in interests_df.columns if user.startswith("availability_of_education_")]].apply(sum).to_frame().style.bar()
Out[11]:
  0
availability_of_education_has_a_higher_education 55189
availability_of_education_no_higher_education 64654

Дети¶

In [12]:
interests_df[[user for user in interests_df.columns if user.startswith("children_")]].apply(sum).to_frame().style.bar()
Out[12]:
  0
children_under_16_there_are_children_in_the_family 71838
children_under_16_no_children_in_the_family 78563

Семейное положение¶

In [13]:
interests_df[[user for user in interests_df.columns if user.startswith("marital_")]].apply(sum).to_frame().style.bar()
Out[13]:
  0
marital_status_married 22707
marital_status_not_married 52332

Доход¶

In [14]:
interests_df[[user for user in interests_df.columns if "individual_income_" in user]].apply(sum).to_frame().style.bar()
Out[14]:
  0
individual_income_a_below_average_income 5453
individual_income_b_average_income 11620
individual_income_c_above_average_income 5440
individual_income_d_high_income 2085
individual_income_e_premium 1262
In [15]:
interests_df[[user for user in interests_df.columns if "household_income_" in user]].apply(sum).to_frame().style.bar()
Out[15]:
  0
household_income_a_below_average 3917
household_income_b_average 50697
household_income_c_above_average 86867

Интересы¶

In [16]:
interests_df[[user for user in interests_df.columns if user.startswith("interests_")]].apply(sum).sort_values(ascending=False).to_frame().style.bar()
Out[16]:
  0
interests_banks_banking_services 61024
interests_new_buildings 58974
interests_car_owners 55978
interests_b2b_documentary_and_financial_and_legal_support 52628
interests_auto_premium_class 51677
interests_medium_and_large_business 51580
interests_renting_residential_property 50192
interests_b2b_it_for_business 50041
interests_auto_middle_class 50041
interests_auto_economy_class 49551
interests_b2b_raw_materials 48303
interests_resale_property 48132
interests_baby_products 47634
interests_interest_in_buying_a_new_car 47208
interests_contributions_and_deposits 46209
interests_mortgage 45086
interests_loans_for_business 44978
interests_credit_cards 44713
interests_interest_in_insurance 44569
interests_consumer_loans 42902
interests_interest_in_buying_a_new_premium_car 42245
interests_auto_insurance 41112
interests_moto 40302
interests_interest_in_buying_a_new_economy_class_car 39955
interests_interest_in_buying_a_mobile_phone 39740
interests_freight_and_commercial_vehicles 39159
interests_auto_parts_and_service 38710
interests_interest_in_buying_a_new_middle_class_car 38656
interests_all_about_children 37114
interests_overseas_real_estate 36566
interests_wedding 35279
interests_b2b_trade_equipment_and_goods_wholesale 34546
interests_used_cars 34170
interests_b2b_medical_equipment 34113
interests_baby_food 32770
interests_b2b_agriculture 32546
interests_mobile_devices 30739
interests_television_and_video_equipment 29597
interests_special_equipment 29147
interests_car_loans 26554
interests_cell_phones_and_headset 25890
interests_auto_suvs 25263
interests_tires_and_wheels 24931
interests_parents_of_toddlers 23065
interests_b2b_office 20957
interests_telecom_operators 20718
interests_quotes_stock_markets 20256
interests_laptops_and_netbooks 19950
interests_microloans 19630
interests_using_online_banking 19531
interests_pregnancy_and_childbirth 19376
interests_learning_languages 18118
interests_parents_of_middle_and_high_school_students 17960
interests_use_of_electronic_money 17907
interests_of_parents_of_primary_school_students 16924
interests_parents_of_newborns 16849
interests_b2b_equipment_machines_energy_supply 16636
interests_mobile_communications_and_internet_access 16358
interests_photo_and_video_cameras 16136
interests_houses_cottages_and_land_plots 15382
interests_education 14821
interests_small_business 14804
interests_audio_engineering 14345
interests_business_education 13252
interests_parents_of_preschoolers 11619
interests_internet_access 11511
interests_childrens_health 11186
interests_finance_and_accounting 11114
interests_commercial_real_estate 9786
interests_basic 8983
interests_legal_support 8964
interests_preschool 8252
interests_b2b_advertising_and_marketing 6442
interests_higher 5035
interests_auto_electronics_and_gps 5010
interests_tablets_and_ereaders 3532
interests_average 2693
interests_tvs 2620
interests_human_resources 877
interests_specialized_secondary 875
interests_active_mobile_internet_users 859

Построение гистограммы интересов внутри гексагона¶

In [69]:
h3_cell = "8911810832fffff"
h3_cell_from_center = "8911817240fffff"
ids = locs_df_g[locs_df_g["h3_cell"] == h3_cell]["ids"][0]
df = interests_df[[user for user in interests_df.columns if user.startswith("interests_") or "id" in user]]
df = df[df.id.isin(ids)]
df.drop("id", axis=1).apply(sum).sort_values(ascending=False).to_frame().style.bar()
Out[69]:
  0
interests_auto_parts_and_service 3
interests_new_buildings 2
interests_b2b_raw_materials 2
interests_banks_banking_services 2
interests_auto_suvs 2
interests_auto_economy_class 2
interests_auto_premium_class 2
interests_car_owners 2
interests_auto_middle_class 2
interests_interest_in_buying_a_new_economy_class_car 2
interests_interest_in_buying_a_new_car 2
interests_tires_and_wheels 2
individual_income_c_above_average_income 1
individual_income_b_average_income 1
interests_moto 1
interests_of_parents_of_primary_school_students 1
interests_mortgage 1
interests_car_loans 1
interests_cell_phones_and_headset 1
interests_auto_insurance 1
interests_parents_of_middle_and_high_school_students 1
interests_b2b_trade_equipment_and_goods_wholesale 1
interests_parents_of_newborns 1
interests_photo_and_video_cameras 1
interests_used_cars 1
interests_interest_in_buying_a_mobile_phone 1
interests_using_online_banking 1
interests_interest_in_buying_a_new_middle_class_car 1
interests_special_equipment 1
interests_freight_and_commercial_vehicles 1
interests_b2b_office 1
interests_b2b_it_for_business 1
individual_income_a_below_average_income 0
interests_credit_cards 0
interests_learning_languages 0
interests_education 0
interests_pregnancy_and_childbirth 0
interests_human_resources 0
interests_medium_and_large_business 0
interests_finance_and_accounting 0
interests_legal_support 0
interests_baby_food 0
individual_income_d_high_income 0
individual_income_e_premium 0
interests_wedding 0
interests_microloans 0
interests_small_business 0
interests_contributions_and_deposits 0
interests_commercial_real_estate 0
interests_parents_of_preschoolers 0
interests_parents_of_toddlers 0
interests_use_of_electronic_money 0
interests_mobile_communications_and_internet_access 0
interests_b2b_advertising_and_marketing 0
interests_higher 0
interests_specialized_secondary 0
interests_b2b_equipment_machines_energy_supply 0
interests_b2b_documentary_and_financial_and_legal_support 0
interests_b2b_medical_equipment 0
interests_childrens_health 0
interests_tvs 0
interests_telecom_operators 0
interests_internet_access 0
interests_renting_residential_property 0
interests_b2b_agriculture 0
interests_active_mobile_internet_users 0
interests_interest_in_buying_a_new_premium_car 0
interests_tablets_and_ereaders 0
interests_laptops_and_netbooks 0
interests_baby_products 0
interests_audio_engineering 0
interests_television_and_video_equipment 0
interests_auto_electronics_and_gps 0
interests_consumer_loans 0
interests_loans_for_business 0
interests_quotes_stock_markets 0
interests_interest_in_insurance 0
interests_houses_cottages_and_land_plots 0
interests_overseas_real_estate 0
interests_resale_property 0
interests_all_about_children 0
interests_mobile_devices 0
interests_preschool 0
interests_basic 0
interests_average 0
interests_business_education 0
In [62]:
 
In [70]:
locs_df_g[locs_df_g["h3_cell"] == h3_cell]
Out[70]:
h3_cell ids count geometry
0 8911810832fffff [12410, 56052, 14802, 33036, 33036, 56052, 124... 17 POLYGON ((37.98722651938006 54.87444152107866,...
In [71]:
df
Out[71]:
interests_b2b_advertising_and_marketing interests_b2b_raw_materials interests_b2b_equipment_machines_energy_supply interests_b2b_office interests_b2b_documentary_and_financial_and_legal_support interests_b2b_medical_equipment interests_b2b_trade_equipment_and_goods_wholesale interests_childrens_health interests_tvs interests_telecom_operators ... interests_credit_cards interests_freight_and_commercial_vehicles interests_special_equipment interests_parents_of_newborns interests_parents_of_toddlers interests_parents_of_preschoolers interests_of_parents_of_primary_school_students interests_parents_of_middle_and_high_school_students interests_business_education id
18322 0 1 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 1277
44439 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 12410
67571 0 1 0 1 0 0 0 0 0 0 ... 0 1 1 1 0 0 1 1 0 56052
105688 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 33036

4 rows × 87 columns

In [ ]: